Descrição do Problema

Tratamento dos dados


In [9]:
# Aux Functions

def labels_dictionary():
    """Build a two-way lookup between attribute labels and their positions.

    Reads ``data/communities.names_edit.txt`` (one label per line) and returns
    a single dict that maps every zero-based line index to its label and every
    label back to its line index.

    Returns:
        dict: ``{index: label, label: index, ...}`` for every line of the file.
    """
    labels = dict()
    # The context manager closes the file even if reading fails.
    with open('data/communities.names_edit.txt', 'r') as file:
        for index, line in enumerate(file):
            # Strip only the line terminator: slicing off the last character
            # would truncate a final line that has no trailing newline.
            line = line.rstrip('\n')
            labels[index] = line # Allow to read the label by the index
            labels[line] = index # Allow to read the index by the label
    return labels

def get_labels(labels_dic, count=128):
    """Return the labels stored under the integer keys ``0 .. count-1``.

    Args:
        labels_dic: Mapping that contains an entry for every integer key in
            ``range(count)`` (such as the dict built by ``labels_dictionary``).
        count: Number of leading labels to collect. Defaults to 128, the
            value that used to be hard-coded here.

    Returns:
        list: The labels in index order.

    Raises:
        KeyError: If ``labels_dic`` lacks one of the requested integer keys.
    """
    return [labels_dic[i] for i in range(count)]

def columns_to_remove(communities, missing_data_percentage):
    """Find the columns whose share of missing values exceeds a threshold.

    Args:
        communities: pandas DataFrame to inspect.
        missing_data_percentage: Threshold in percent (0-100). A column is
            reported only when its missing percentage is strictly greater.

    Returns:
        list: Zero-based positional indices of the offending columns, in
        column order.
    """
    incomplete_values = communities.isnull().sum()
    incomplete_percent = (incomplete_values/communities.shape[0]*100)
    # Walk the raw values by position. The Series is indexed by column label,
    # and looking a position up with `series[i]` is deprecated in pandas
    # (integer keys are being switched to label lookups).
    return [
        position
        for position, percent in enumerate(incomplete_percent.values)
        if percent > missing_data_percentage
    ]




# incomplete_values = communities.isnull().sum()
# incomplete_values_percent = (incomplete_values/communities.shape[0]*100)
# print("Percent of Incomple values")
# print(list(map(lambda x: x>75, incomplete_values_percent)))

In [10]:
import pandas
import numpy

# Load the data set, naming the columns after the labels file.
labels_dic = labels_dictionary()
communities = pandas.read_csv('data/communities.data.txt', sep=",", names=get_labels(labels_dic), encoding='utf-8')


# '?' entries are treated as missing values. numpy.nan is used because the
# numpy.NaN alias was removed in NumPy 2.0.
communities = communities.replace('?', numpy.nan)
col_to_remove = columns_to_remove(communities, 75) # Columns with more than 75% of missing data are removed


print("More than 75% incomplete:")
# Translate the positional indices back into column labels.
list_of_incomplete = [labels_dic[col] for col in col_to_remove]
print(list_of_incomplete)
print("----\n")

# Features: the first 127 columns, minus the mostly-empty ones and the
# free-text community name.
X = communities.iloc[:, 0:127] # OR .drop(labels='ViolentCrimesPerPop numeric', axis=1)
X = X.drop(labels=list_of_incomplete, axis=1)
X = X.drop(labels=['communityname string'], axis=1) #Temporary, change later
print("X new shape: ", X.shape, "\n")
# Target: the column at position 127, kept as a one-column frame.
Y = communities.iloc[:, [127]]


# df = pandas.DataFrame(X, columns=list(set(X['communityname string'])) )
# dummies = pandas.get_dummies(df)
# X.join(dummies)
# print(X.shape)

# X_val = X.values
# Y_val = Y.values

#print(X_val)
#print(Y_val)


More than 75% incomplete:
['LemasSwornFT numeric', 'LemasSwFTPerPop numeric', 'LemasSwFTFieldOps numeric', 'LemasSwFTFieldPerPop numeric', 'LemasTotalReq numeric', 'LemasTotReqPerPop numeric', 'PolicReqPerOffic numeric', 'PolicPerPop numeric', 'RacialMatchCommPol numeric', 'PctPolicWhite numeric', 'PctPolicBlack numeric', 'PctPolicHisp numeric', 'PctPolicAsian numeric', 'PctPolicMinor numeric', 'OfficAssgnDrugUnits numeric', 'NumKindsDrugsSeiz numeric', 'PolicAveOTWorked numeric', 'PolicCars numeric', 'PolicOperBudg numeric', 'LemasPctPolicOnPatr numeric', 'LemasGangUnitDeploy numeric', 'PolicBudgPerPop numeric']
----

X new shape:  (1994, 104) 

* Removing all Columns With Missing Values


In [11]:
X_rem = X
X_rem.dropna(inplace=True, axis='columns')
X_rem = X_rem.values
Y_rem = Y.values
#print(X_rem.shape)


from sklearn import linear_model
lm = linear_model.LinearRegression()

from sklearn.cross_validation import  cross_val_predict
from sklearn import metrics

predictions = cross_val_predict(lm, X_rem, Y_rem, cv=6)

r2_score = metrics.r2_score(Y_rem, predictions)
print(r2_score)

mean_squared_error = metrics.mean_squared_error(Y_rem, predictions)
print(mean_squared_error)


0.652390305044
0.0188594742411

* Imputing Missing Values


In [12]:
from pandas import read_csv
from sklearn.preprocessing import Imputer

X_val = X.values
Y_val = Y.values
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imp = imputer.fit_transform(X_val)


from sklearn import linear_model
lm = linear_model.LinearRegression()

from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics


predictions = cross_val_predict(lm, X_val, Y_val, cv=6)

r2_score = metrics.r2_score(Y_val, predictions)
print(r2_score)

mean_squared_error = metrics.mean_squared_error(Y_val, predictions)
print(mean_squared_error)


0.652390305044
0.0188594742411

* Using PCA to remove features


In [13]:
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn import linear_model
from sklearn.decomposition import KernelPCA

X_val = X.values
Y_val = Y.values
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imp = imputer.fit_transform(X_val)


#kpca = KernelPCA(n_components=50, kernel='linear')
kpca = KernelPCA(n_components=50, kernel='poly', degree=3)
X_KPCA = kpca.fit_transform(X_val)



lm = linear_model.LinearRegression()


predictions = cross_val_predict(lm, X_KPCA, Y_val, cv=6)

r2_score = metrics.r2_score(Y_val, predictions)
print(r2_score)

mean_squared_error = metrics.mean_squared_error(Y_val, predictions)
print(mean_squared_error)


0.439494977767
0.0304100552493

In [ ]:


In [ ]: